The 2022 FIFA World Cup will be hosted by Qatar. We want to predict
which country has the highest probability of winning the tournament
using supervised learning.
2. EDA
# class of each variable
spec(input_data)
cols(
date = col_date(format = ""),
home_team = col_character(),
away_team = col_character(),
home_team_continent = col_character(),
away_team_continent = col_character(),
home_team_fifa_rank = col_double(),
away_team_fifa_rank = col_double(),
home_team_total_fifa_points = col_double(),
away_team_total_fifa_points = col_double(),
home_team_score = col_double(),
away_team_score = col_double(),
tournament = col_character(),
city = col_character(),
country = col_character(),
neutral_location = col_logical(),
shoot_out = col_character(),
home_team_result = col_character(),
home_team_goalkeeper_score = col_double(),
away_team_goalkeeper_score = col_double(),
home_team_mean_defense_score = col_double(),
home_team_mean_offense_score = col_double(),
home_team_mean_midfield_score = col_double(),
away_team_mean_defense_score = col_double(),
away_team_mean_offense_score = col_double(),
away_team_mean_midfield_score = col_double()
)
# summary
skim_without_charts(input_data)
── Data Summary ────────────────────────
Values
Name input_data
Number of rows 23921
Number of columns 25
_______________________
Column type frequency:
character 9
Date 1
logical 1
numeric 14
________________________
Group variables None
Missing data
input_data %>%
summarise_all(list(~is.na(.)))%>%
pivot_longer(everything(),
names_to = "variables", values_to="missing") %>%
count(variables, missing) %>%
ggplot(aes(y=variables,x=n,fill=missing))+
geom_col()+
scale_fill_manual(values=c("#A3BE8C","#EBCB8B"))+
theme(axis.title.y=element_blank())

Top 10 teams in 2022
# Get the ranking of all home teams
home <-
input_data %>%
select(date, home_team, home_team_fifa_rank) %>%
rename(team = home_team, ranking = home_team_fifa_rank)
# Get the ranking of all away teams
away <-
input_data %>%
select(date, away_team, away_team_fifa_rank) %>%
rename(team = away_team, ranking = away_team_fifa_rank)
# Combine both data frames into one
fifa_ranking <- rbind(home, away)
# Get the latest ranking of each country based on their most recent match
latest_fifa_ranking <-
fifa_ranking %>%
arrange(team, desc(date)) %>%
group_by(team) %>%
mutate(row_number = row_number(team)) %>%
filter(row_number == 1) %>%
select(-row_number, -date) %>%
arrange(ranking)
head(latest_fifa_ranking, 10)
FIFA rankings over time
top5_list <- head(latest_fifa_ranking, 5)$team
top5_ranking <-
fifa_ranking %>%
filter(team %in% top5_list)
p <-
ggplot(data = top5_ranking,
mapping = aes(
x = date,
y = ranking,
group = team,
color = team
)) +
geom_line() +
scale_y_reverse() +
labs(
x = "Date",
y = "FIFA Ranking",
color = "Team",
title = "FIFA Rankings of the 2022 Top 5 teams"
)
ggplotly(p)
NA
Teams with strongest GK
# Gather goalkeeper data from matches
gk_home <-
input_data %>%
select(date, home_team, home_team_goalkeeper_score) %>%
rename(team = home_team, goalkeeper_rating = home_team_goalkeeper_score)
gk_away <-
input_data %>%
select(date, away_team, away_team_goalkeeper_score) %>%
rename(team = away_team, goalkeeper_rating = away_team_goalkeeper_score)
gk_rating <- drop_na(rbind(gk_home, gk_away))
# Get latest rating of each team's goalkeeper and show top 10
latest_gk_rating <-
gk_rating %>%
arrange(team, desc(date)) %>%
group_by(team) %>%
mutate(row_number = row_number(team)) %>%
filter(row_number == 1) %>%
select(-row_number, -date) %>%
arrange(-goalkeeper_rating)
ggplot(data = head(latest_gk_rating, 10), mapping = aes(x=goalkeeper_rating, y=reorder(team, goalkeeper_rating), label=goalkeeper_rating)) +
geom_col(fill="#88C0D0") +
geom_text(position = position_stack(vjust = 0.5)) +
labs(title = "Top 10 teams with the strongest goalkeeper",
subtitle = "Based on the highest rated goalkeeper of each team",
x="Goalkeeper Rating",
y="Country")

Teams with strongest defense
# Gather goalkeeper and defense data from matches
def_home <-
input_data %>%
select(date, home_team, home_team_goalkeeper_score, home_team_mean_defense_score) %>%
rename(team = home_team, goalkeeper_rating = home_team_goalkeeper_score, mean_defense_rating = home_team_mean_defense_score)
def_away <-
input_data %>%
select(date, away_team, away_team_goalkeeper_score, away_team_mean_defense_score) %>%
rename(team = away_team, goalkeeper_rating = away_team_goalkeeper_score, mean_defense_rating = away_team_mean_defense_score)
def_rating <- drop_na(rbind(def_home, def_away))
# Get latest combined rating of each team and show top 10
latest_def_rating <-
def_rating %>%
arrange(team, desc(date)) %>%
mutate(total_def = goalkeeper_rating + mean_defense_rating) %>%
group_by(team) %>%
mutate(row_number = row_number(team)) %>%
filter(row_number==1) %>%
arrange(-total_def) %>%
select(-row_number, -date)
ggplot(data = head(latest_def_rating, 10), mapping=aes(x=total_def, y=reorder(team, total_def), label=total_def)) +
geom_col(fill="#88C0D0") +
geom_text(position = position_stack(vjust = 0.5)) +
labs(title = "Top 10 teams with the strongest defense",
subtitle = "Based on goalkeeper and mean defense ratings",
x = "Total Defense Rating",
y = "Teams")

Teams with strongest midfield
mid_home <-
input_data %>%
select(date, home_team, home_team_mean_midfield_score) %>%
rename(team = home_team, midfield_rating = home_team_mean_midfield_score)
mid_away <-
input_data %>%
select(date, away_team, away_team_mean_midfield_score) %>%
rename(team = away_team, midfield_rating = away_team_mean_midfield_score)
mid_rating <- drop_na(rbind(mid_home, mid_away))
# Get latest midfield rating of each team and show top 10
latest_mid_rating <-
mid_rating %>%
arrange(team, desc(date)) %>%
group_by(team) %>%
mutate(row_number = row_number(team)) %>%
filter(row_number == 1) %>%
arrange(-midfield_rating) %>%
select(-date, -row_number)
ggplot(data = head(latest_mid_rating, 10), mapping=aes(x=midfield_rating, y=reorder(team, midfield_rating), label=midfield_rating)) +
geom_col(fill= "#88C0D0") +
geom_text(position = position_stack(vjust = 0.5)) +
labs(title = "Top 10 teams with the strongest midfield",
subtitle = "Based on the average rating of the 4 highest rated midfield players of each team",
x = "Midfield Rating",
y = "Teams")

Teams with strongest offense
off_home <-
input_data %>%
select(date, home_team, home_team_mean_offense_score) %>%
rename(team = home_team, offense_rating = home_team_mean_offense_score)
off_away <-
input_data %>%
select(date, away_team, away_team_mean_offense_score) %>%
rename(team = away_team, offense_rating = away_team_mean_offense_score)
off_rating <- drop_na(rbind(off_home, off_away))
# Get latest offense rating of each team and show top 10
latest_off_rating <-
off_rating %>%
arrange(team, desc(date)) %>%
group_by(team) %>%
mutate(row_number = row_number(team)) %>%
filter(row_number == 1) %>%
arrange(-offense_rating) %>%
select(-date, -row_number)
ggplot(data = head(latest_off_rating, 10), mapping=aes(x=offense_rating, y=reorder(team, offense_rating), label=offense_rating)) +
geom_col(fill="#88C0D0") +
geom_text(position = position_stack(vjust = 0.5)) +
labs(title="Top 10 teams with the strongest offense",
subtitle="Based on the average rating of the 3 highest rated offensive players of each team",
x="Offense Rating",
y="Teams")

Is it better to play at home ?
home_team_advantage <-
input_data %>%
filter(neutral_location == FALSE) %>%
count(home_team_result) %>%
mutate(percentage = label_percent()(n/sum(n)))
ggplot(data = home_team_advantage, mapping=aes(x="", y=n, fill=home_team_result)) +
geom_bar(width = 1, stat = "identity", color="white") +
coord_polar("y") +
scale_fill_manual(values = c("#EBCB8B", "#BF616A",
"#A3BE8C")) +
theme_void() +
labs(title = "Distribution of match results of home teams",
subtitle = "Excluding matches played at neutral locations",
fill="Result")

Correlation Matrix
# select numeric columns only
input_numeric_data <- input_data %>%
select_if(is.numeric) %>%
drop_na()
# rename variables for easier correlation plot visualization
input_numeric_data <- input_numeric_data %>% rename(
rank1 = home_team_fifa_rank,
rank2 = away_team_fifa_rank,
total_fifa_points1 = home_team_total_fifa_points,
total_fifa_points2 = away_team_total_fifa_points,
score1 = home_team_score,
score2 = away_team_score,
gk_score1 = home_team_goalkeeper_score,
gk_score2 = away_team_goalkeeper_score,
df_score1 = home_team_mean_defense_score,
df_score2 = away_team_mean_defense_score,
att_score1 = home_team_mean_offense_score,
att_score2 = away_team_mean_offense_score,
mf_score1 = home_team_mean_midfield_score,
mf_score2 = away_team_mean_midfield_score
)
# create correlation plot
input_numeric_data %>%
cor() %>%
corrplot(
type = "upper",
diag = FALSE,
col=colorRampPalette(c("firebrick","lightyellow","green4"))(100),
method = "shade",
shade.col = NA,
tl.col = "black",
tl.srt = 45
)

3. Data Processing / Feature Engineering
Create new features
output_data$win <- output_data$score_difference > 0
Warning: Unknown or uninitialised column: `score_difference`.
Error:
! Assigned data `output_data$score_difference > 0` must be compatible with existing data.
✖ Existing data has 23921 rows.
✖ Assigned data has 0 rows.
ℹ Only vectors of size 1 are recycled.
Backtrace:
1. base::`$<-`(`*tmp*`, win, value = `<lgl>`)
12. tibble (local) `<fn>`(`<vctrs___>`)
Model
# create training and test set
sample <- sample(c(TRUE, FALSE), nrow(output_data), replace=TRUE, prob=c(0.7,0.3))
train <- output_data[sample, ]
test <- output_data[!sample, ]
# fit logistic regression model
logreg <- glm(win ~ average_rank + rank_diff + point_diff, family = "binomial", data = train)
summary(logreg)
Call:
glm(formula = win ~ average_rank + rank_diff + point_diff, family = "binomial",
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6642 -1.0028 -0.3857 1.0260 2.5995
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.2726775 0.0330011 -8.263 < 2e-16 ***
average_rank 0.0018669 0.0003617 5.161 2.46e-07 ***
rank_diff -0.0196047 0.0004886 -40.120 < 2e-16 ***
point_diff 0.0003186 0.0001322 2.410 0.016 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 23081 on 16657 degrees of freedom
Residual deviance: 19729 on 16654 degrees of freedom
AIC: 19737
Number of Fisher Scoring iterations: 4
# calc probability of win for each team in test dataset
predicted <- predict(logreg, test, type="response")
# convert wins to 1 and 0
test$win <- ifelse(test$win==TRUE, 1, 0)
# find optimal cutoff probability to use to maximize accuracy
optimal <- optimalCutoff(test$win, predicted)[1]
optimal
[1] 0.5171179
# confusion matrix
confusionMatrix(test$win, predicted)
# calculate miss classification error rate
misClassError(test$win, predicted, threshold = optimal)
[1] 0.3192
# ROC
plotROC(test$win, predicted)

Test
index(wc_rankings_away)
Error in index(wc_rankings_away) : could not find function "index"
row <- data.frame(matrix(nrow = 0, ncol = length(colnames(test)))) %>%
colnames(row) <- colnames(test)
Error in data.frame(matrix(nrow = 0, ncol = length(colnames(test)))) %>% :
target of assignment expands to non-language object
---
title: "World Cup 2022"
output:
  html_notebook: default
  pdf_document: default
---

```{r, echo=FALSE}
library(dplyr)
library(skimr)
library(plotly)
library(scales)
library(corrplot)
library(tidyverse)
library(gganimate)
library(InformationValue)
```

The 2022 FIFA World Cup will be hosted by Qatar. We want to predict which
country has the highest probability of winning the tournament using supervised learning.

# 1. Data

The data that we will be using is already cleaned and does not require pre-processing.

The data set provides a complete overview of all international football matches played since the 1990s. On top of that, the strength of each team is provided by incorporating the FIFA rankings as well as player strengths based on the EA Sports FIFA video game. It is available on [Kaggle](https://www.kaggle.com/datasets/brenda89/fifa-world-cup-2022).

## Variables

- `date` : date of the match
- `home_team` : name of the home team
- `away_team` : name of the away team
- `home_team_continent` : continent of the home team
- `away_team_continent` : continent of the away team
- `home_team_fifa_rank` : FIFA rank of the home team when the match took place
- `away_team_fifa_rank` : FIFA rank of the away team when the match took place
- `home_team_total_fifa_points` : total number of FIFA points of the home team at the time of the match
- `away_team_total_fifa_points` : total number of FIFA points of the away team at the time of the match
- `home_team_score` : full-time home score (excluding penalty shootout)
- `away_team_score` : full-time away score (excluding penalty shootout)
- `tournament` : name of tournament
- `city` : name of the city where the match was played
- `country` : name of the country where the match was played
- `neutral_location` :
  - `TRUE` : the match was played at a neutral venue
- `shoot_out`:
  - `TRUE` : the match included a penalty shootout
- `home_team_result` : result of the home team (including penalty shootout)
- `home_team_goalkeeper_score` : FIFA game score of the highest ranked GK of the home team
- `away_team_goalkeeper_score` : FIFA game score of the highest ranked GK of the away team
- `home_team_mean_defense_score` : Average FIFA game score of the 4 highest ranked defensive players of the home team
- `away_team_mean_defense_score` : Average FIFA game score of the 4 highest ranked defensive players of the away team
- `home_team_mean_midfield_score` : Average FIFA game score of the 4 highest ranked midfield players of the home team
- `away_team_mean_midfield_score` : Average FIFA game score of the 4 highest ranked midfield players of the away team
- `home_team_mean_offense_score` : Average FIFA game score of the 3 highest ranked attacking players of the home team, including wing players
- `away_team_mean_offense_score` : Average FIFA game score of the 3 highest ranked attacking players of the away team, including wing players

# 2. EDA

```{r}
# Column specification (parsed type of every variable) as recorded by readr.
# NOTE(review): `input_data` is not created in the visible part of this
# notebook, so it must already exist in the session -- confirm, or add an
# explicit import chunk before this one.
input_data %>%
  spec()
```
```{r}
# Per-column summary of the match data (row/column counts, column types and
# summary statistics), printed without skimr's inline histogram column.
# The data frame is passed directly rather than piped so that the printed
# summary reports its real name.
skim_without_charts(input_data)
```

## Missing data

```{r}
# Count, for every variable, how many observations are missing vs. present and
# show the split as one stacked horizontal bar per variable.
input_data %>%
  # Flag each cell as missing (TRUE) or present (FALSE). across() replaces the
  # superseded summarise_all(), which additionally relied on summarise()
  # returning more than one row per group.
  mutate(across(everything(), is.na)) %>%
  # One row per (variable, cell) so the flags can be counted per variable.
  pivot_longer(
    everything(),
    names_to = "variables",
    values_to = "missing"
  ) %>%
  count(variables, missing) %>%
  ggplot(aes(x = n, y = variables, fill = missing)) +
  geom_col() +
  scale_fill_manual(values = c("#A3BE8C", "#EBCB8B")) +
  theme(axis.title.y = element_blank())
```


## Top 10 teams in 2022

```{r}
# Build one long table of (date, team, ranking) by stacking the home-side and
# the away-side columns of every match.

# Ranking of every home team at the time of its match
home <-
  input_data %>%
  select(date, team = home_team, ranking = home_team_fifa_rank)

# Ranking of every away team at the time of its match
away <-
  input_data %>%
  select(date, team = away_team, ranking = away_team_fifa_rank)

# Combine both data frames into one
fifa_ranking <- bind_rows(home, away)

# Latest ranking of each team = the ranking recorded at its most recent match.
# Rows are sorted newest-first within each team, and distinct() keeps the
# first row it meets per team. Unlike the previous group_by()/row_number()
# approach this returns an ungrouped table, so no grouping leaks into the
# chunks that reuse `latest_fifa_ranking`.
latest_fifa_ranking <-
  fifa_ranking %>%
  arrange(team, desc(date)) %>%
  distinct(team, .keep_all = TRUE) %>%
  select(-date) %>%
  arrange(ranking)

# Ten best-ranked teams
head(latest_fifa_ranking, 10)
```

## FIFA rankings over time

```{r}
# Names of the five best-ranked teams according to their latest ranking
top5_list <- latest_fifa_ranking$team %>% head(5)

# Full ranking history restricted to those five teams
top5_ranking <- filter(fifa_ranking, team %in% top5_list)

# One line per team; the y axis is reversed so that rank 1 is drawn at the top
p <-
  top5_ranking %>%
  ggplot(aes(x = date, y = ranking, group = team, color = team)) +
  geom_line() +
  scale_y_reverse() +
  labs(
    title = "FIFA Rankings of the 2022 Top 5 teams",
    x = "Date",
    y = "FIFA Ranking",
    color = "Team"
  )

# Turn the static ggplot into an interactive plotly widget
ggplotly(p)

```

## Teams with strongest GK

```{r}
# Gather goalkeeper data from matches: stack the home-side and away-side
# goalkeeper rating of every match into one long table.
gk_home <-
  input_data %>%
  select(
    date,
    team = home_team,
    goalkeeper_rating = home_team_goalkeeper_score
  )

gk_away <-
  input_data %>%
  select(
    date,
    team = away_team,
    goalkeeper_rating = away_team_goalkeeper_score
  )

# Matches without a goalkeeper rating are dropped
gk_rating <- drop_na(bind_rows(gk_home, gk_away))

# Latest goalkeeper rating of each team. Rows are sorted newest-first within
# each team and distinct() keeps the first row per team; the result is an
# ungrouped table (the previous group_by()/row_number() version left it
# grouped by team).
latest_gk_rating <-
  gk_rating %>%
  arrange(team, desc(date)) %>%
  distinct(team, .keep_all = TRUE) %>%
  select(-date) %>%
  arrange(desc(goalkeeper_rating))

# Horizontal bar chart of the ten best teams, each bar labelled with its rating
latest_gk_rating %>%
  head(10) %>%
  ggplot(aes(
    x = goalkeeper_rating,
    y = reorder(team, goalkeeper_rating),
    label = goalkeeper_rating
  )) +
  geom_col(fill = "#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(
    title = "Top 10 teams with the strongest goalkeeper",
    subtitle = "Based on the highest rated goalkeeper of each team",
    x = "Goalkeeper Rating",
    y = "Country"
  )
```

### Teams with strongest defense

```{r}
# Gather goalkeeper and defense data from matches: stack the home-side and
# away-side ratings of every match into one long table.
def_home <-
  input_data %>%
  select(
    date,
    team = home_team,
    goalkeeper_rating = home_team_goalkeeper_score,
    mean_defense_rating = home_team_mean_defense_score
  )

def_away <-
  input_data %>%
  select(
    date,
    team = away_team,
    goalkeeper_rating = away_team_goalkeeper_score,
    mean_defense_rating = away_team_mean_defense_score
  )

# Matches missing either rating are dropped
def_rating <- drop_na(bind_rows(def_home, def_away))

# Latest combined rating (goalkeeper + mean defense) of each team. Rows are
# sorted newest-first within each team and distinct() keeps the first row per
# team; the result is an ungrouped table (the previous
# group_by()/row_number() version left it grouped by team).
latest_def_rating <-
  def_rating %>%
  mutate(total_def = goalkeeper_rating + mean_defense_rating) %>%
  arrange(team, desc(date)) %>%
  distinct(team, .keep_all = TRUE) %>%
  select(-date) %>%
  arrange(desc(total_def))

# Horizontal bar chart of the ten best teams, each bar labelled with its total
latest_def_rating %>%
  head(10) %>%
  ggplot(aes(
    x = total_def,
    y = reorder(team, total_def),
    label = total_def
  )) +
  geom_col(fill = "#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(
    title = "Top 10 teams with the strongest defense",
    subtitle = "Based on goalkeeper and mean defense ratings",
    x = "Total Defense Rating",
    y = "Teams"
  )
```

### Teams with strongest midfield

```{r}
# Stack the home-side and away-side mean midfield rating of every match into
# one long table.
mid_home <-
  input_data %>%
  select(
    date,
    team = home_team,
    midfield_rating = home_team_mean_midfield_score
  )

mid_away <-
  input_data %>%
  select(
    date,
    team = away_team,
    midfield_rating = away_team_mean_midfield_score
  )

# Matches without a midfield rating are dropped
mid_rating <- drop_na(bind_rows(mid_home, mid_away))

# Latest midfield rating of each team. Rows are sorted newest-first within
# each team and distinct() keeps the first row per team; the result is an
# ungrouped table (the previous group_by()/row_number() version left it
# grouped by team).
latest_mid_rating <-
  mid_rating %>%
  arrange(team, desc(date)) %>%
  distinct(team, .keep_all = TRUE) %>%
  select(-date) %>%
  arrange(desc(midfield_rating))

# Horizontal bar chart of the ten best teams, each bar labelled with its rating
latest_mid_rating %>%
  head(10) %>%
  ggplot(aes(
    x = midfield_rating,
    y = reorder(team, midfield_rating),
    label = midfield_rating
  )) +
  geom_col(fill = "#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(
    title = "Top 10 teams with the strongest midfield",
    subtitle = "Based on the average rating of the 4 highest rated midfield players of each team",
    x = "Midfield Rating",
    y = "Teams"
  )
```

### Teams with strongest offense

```{r}
# Stack the home-side and away-side mean offense rating of every match into
# one long table.
off_home <-
  input_data %>%
  select(
    date,
    team = home_team,
    offense_rating = home_team_mean_offense_score
  )

off_away <-
  input_data %>%
  select(
    date,
    team = away_team,
    offense_rating = away_team_mean_offense_score
  )

# Matches without an offense rating are dropped
off_rating <- drop_na(bind_rows(off_home, off_away))

# Latest offense rating of each team. Rows are sorted newest-first within each
# team and distinct() keeps the first row per team; the result is an ungrouped
# table (the previous group_by()/row_number() version left it grouped by
# team).
latest_off_rating <-
  off_rating %>%
  arrange(team, desc(date)) %>%
  distinct(team, .keep_all = TRUE) %>%
  select(-date) %>%
  arrange(desc(offense_rating))

# Horizontal bar chart of the ten best teams, each bar labelled with its rating
latest_off_rating %>%
  head(10) %>%
  ggplot(aes(
    x = offense_rating,
    y = reorder(team, offense_rating),
    label = offense_rating
  )) +
  geom_col(fill = "#88C0D0") +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(
    title = "Top 10 teams with the strongest offense",
    subtitle = "Based on the average rating of the 3 highest rated offensive players of each team",
    x = "Offense Rating",
    y = "Teams"
  )
```

## Is it better to play at home?

```{r}
# Share of each home-team result, counting only matches that were not played
# at a neutral venue.
home_team_advantage <-
  input_data %>%
  filter(!neutral_location) %>%
  count(home_team_result) %>%
  # formatted share of each result, used as the slice label below
  mutate(percentage = label_percent()(n / sum(n)))

# Pie chart: a single stacked bar bent into polar coordinates. The
# `percentage` column was previously computed but never displayed; it is now
# printed in the middle of each slice.
ggplot(home_team_advantage, aes(x = "", y = n, fill = home_team_result)) +
  geom_col(width = 1, color = "white") +
  geom_text(aes(label = percentage), position = position_stack(vjust = 0.5)) +
  coord_polar("y") +
  # colours are assigned to the result levels in their sorted order
  scale_fill_manual(values = c("#EBCB8B", "#BF616A", "#A3BE8C")) +
  theme_void() +
  labs(
    title = "Distribution of match results of home teams",
    subtitle = "Excluding matches played at neutral locations",
    fill = "Result"
  )
```

## Correlation Matrix

```{r}
# Keep the numeric columns only, drop incomplete rows and shorten the column
# names so the labels of the correlation plot stay readable
# (suffix 1 = home team, suffix 2 = away team).
# select(where()) replaces the superseded select_if().
input_numeric_data <-
  input_data %>%
  select(where(is.numeric)) %>%
  drop_na() %>%
  rename(
    rank1 = home_team_fifa_rank,
    rank2 = away_team_fifa_rank,
    total_fifa_points1 = home_team_total_fifa_points,
    total_fifa_points2 = away_team_total_fifa_points,
    score1 = home_team_score,
    score2 = away_team_score,
    gk_score1 = home_team_goalkeeper_score,
    gk_score2 = away_team_goalkeeper_score,
    df_score1 = home_team_mean_defense_score,
    df_score2 = away_team_mean_defense_score,
    att_score1 = home_team_mean_offense_score,
    att_score2 = away_team_mean_offense_score,
    mf_score1 = home_team_mean_midfield_score,
    mf_score2 = away_team_mean_midfield_score
  )

# Pairwise correlations, drawn as the upper triangle without the diagonal
input_numeric_data %>%
  cor() %>%
  corrplot(
    type = "upper",
    diag = FALSE,
    col = colorRampPalette(c("firebrick", "lightyellow", "green4"))(100),
    method = "shade",
    shade.col = NA,
    tl.col = "black",
    tl.srt = 45
  )
```

# 3. Data Processing / Feature Engineering

## Create new features

```{r}
# Derive the match-level features used by the model from the raw match data.
output_data <-
  input_data %>%
  mutate(
    # home minus away FIFA rank
    rank_diff = home_team_fifa_rank - away_team_fifa_rank,
    # mean FIFA rank of the two teams
    average_rank = (home_team_fifa_rank + away_team_fifa_rank) / 2,
    # home minus away total FIFA points
    point_diff = home_team_total_fifa_points - away_team_total_fifa_points,
    # home minus away full-time score
    score_diff = home_team_score - away_team_score,
    # TRUE only when the home team scored more; draws and losses are FALSE
    win = score_diff > 0,
    # TRUE for every tournament other than a friendly
    stake = tournament != 'Friendly'
  )
```

## Model

```{r}
# Fix the RNG seed so the random split, and therefore every number reported
# below, is reproducible between runs.
set.seed(2022)

# create training and test set: each match independently goes to the training
# set with probability 0.7, otherwise to the test set.
# (named `is_train` so it no longer shadows base::sample)
is_train <-
  sample(c(TRUE, FALSE),
         nrow(output_data),
         replace = TRUE,
         prob = c(0.7, 0.3))
train <- output_data[is_train, ]
test <- output_data[!is_train, ]

# fit logistic regression model: probability that the home team wins
logreg <-
  glm(win ~ average_rank + rank_diff + point_diff,
      family = "binomial",
      data = train)
summary(logreg)

# predicted probability of a home win for each match in the test set
predicted <- predict(logreg, test, type = "response")

# convert wins from TRUE/FALSE to 1/0
test$win <- as.numeric(test$win)

# find the cutoff probability that maximises accuracy
# NOTE(review): the cutoff is tuned on the test set and then evaluated on the
# same set, so the metrics below are optimistic -- consider tuning on a
# separate validation split.
optimal <- optimalCutoff(test$win, predicted)[1]
optimal

# confusion matrix at the tuned cutoff (previously evaluated at the default
# 0.5, which was inconsistent with the error rate below)
confusionMatrix(test$win, predicted, threshold = optimal)

# misclassification error rate at the tuned cutoff
misClassError(test$win, predicted, threshold = optimal)

# ROC
plotROC(test$win, predicted)

```

## Test

```{r}
# The 32 teams taking part in the 2022 World Cup, stored as a character vector
# (a plain set of strings does not need a list).
# NOTE(review): confirm these spellings match `home_team` / `away_team` in the
# data -- a name spelled differently there would silently match no rows.
wc_teams <- c(
  "Qatar", "Ecuador", "Senegal", "Netherlands",
  "England", "Iran", "USA", "Wales",
  "Argentina", "Saudi Arabia", "Mexico", "Poland",
  "France", "Australia", "Denmark", "Tunisia",
  "Spain", "Costa Rica", "Germany", "Japan",
  "Belgium", "Canada", "Morocco", "Croatia",
  "Brazil", "Serbia", "Switzerland", "Cameroon",
  "Portugal", "Ghana", "Uruguay", "South Korea"
)

# Only matches played after this date are kept. Using an explicit Date avoids
# relying on the implicit string-to-Date coercion of `date > "2013-01-01"`.
wc_cutoff_date <- as.Date("2013-01-01")

# Rank and points of the World Cup teams when they played as the home side
wc_rankings_home <-
  output_data %>%
  filter(date > wc_cutoff_date) %>%
  select(home_team, home_team_fifa_rank, home_team_total_fifa_points) %>%
  filter(home_team %in% wc_teams)

# Rank and points of the World Cup teams when they played as the away side
wc_rankings_away <-
  output_data %>%
  filter(date > wc_cutoff_date) %>%
  select(away_team, away_team_fifa_rank, away_team_total_fifa_points) %>%
  filter(away_team %in% wc_teams)
```

```{r}
# prepare lists (containers for the simulation results; nothing below fills
# them yet)
# NOTE(review): `simulation_results_quarterfina` and
# `simulation_results_semifina` look like truncated names; they are kept
# unchanged here in case other code refers to them.
simulation_winners <- list()
simulation_results_winners <- list()
simulation_results_round16 <- list()
simulation_df_round16 <- list()
simulation_results_quarterfina <- list()
simulation_df_quarterfinal <- list()
simulation_results_semifina <- list()
simulation_df_semifinal <- list()

# simulations
n <- 1000

# select who will come out of the group stages; consecutive entries
# (1st & 2nd, 3rd & 4th, ...) are paired up as opponents in the loop below
candidates <-
  list(
    "Senegal",
    "Netherlands",
    "England",
    "USA",
    "Argentina",
    "Poland",
    "France",
    "Denmark",
    "Spain",
    "Germany",
    "Belgium",
    "Croatia",
    "Brazil",
    "Serbia",
    "Portugal",
    "Uruguay"
  )
finals <- list("round_of_16", "quarterfinal", "semifinal", "final")

# simulate
# NOTE(review): this is still a skeleton -- no match is predicted, and
# `winners`, `prob` and `candidates` are never updated between rounds.
for (f in finals) {
  # number of matches in this round: one per pair of remaining candidates
  iterations <- length(candidates) %/% 2
  winners <- list()
  prob <- list()

  # seq_len() visits every match index 1..iterations; the previous
  # range(iterations) returned c(min, max), i.e. the same value twice.
  for (i in seq_len(iterations)) {
    # R is 1-based, so match i pairs entries 2i-1 and 2i; `[[` extracts the
    # team name itself rather than a one-element list. Dedicated names are
    # used so the `home` / `away` tables built during the EDA are not
    # overwritten.
    home_name <- candidates[[2 * i - 1]]
    away_name <- candidates[[2 * i]]

    # empty one-match frame with the same columns as the model's test set
    # (named `match_row` so it does not shadow base::row)
    match_row <- data.frame(matrix(nrow = 0, ncol = length(colnames(test))))
    colnames(match_row) <- colnames(test)

    # rank / points rows of the home side
    # NOTE(review): the original `filter` call had no predicate; filtering on
    # the home team's name is the presumed intent -- confirm.
    home_rank <- wc_rankings_home %>% filter(home_team == home_name)
  }
}
  








```


















